Loading the main libraries

library(rvest)
library(xml2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaflet)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
url <- "https://www.tomaticket.es/index.php?accion=search&buscador=&IdLugar=0&IdTag=4"

tomaticket <- read_html(url) |> xml_child()

Data Cleaning and Final Data Frame Creation

df_theaters <- data.frame(theater = theaters)
df_shows <- data.frame(show_links)

clean_dates <- sub("^[^=]+=[^=]+=(.*)$", "\\1", datetime)
clean_dates <- sub(".*\"(\\d{4}-\\d{2}-\\d{2}).*", "\\1", clean_dates)
dates <- as.Date(clean_dates)

df_date <- data.frame(dates)
df_date$id <- c(1:100)

df_theaters$theater_name <- sub(".*/recintos/(.*)$", "\\1", df_theaters$theater)
df_theaters <- df_theaters %>% 
  select(-theater)

df_theaters$id <- c(1:100)
final_df <- left_join(df_theaters, df_date, by = "id")

df_shows$show <- sub(".*/entradas-(.*)", "\\1", df_shows$show_links)
df_shows <- df_shows %>% 
  select(-show_links)
df_shows$id <- c(1:100)
final_df <- left_join(final_df, df_shows, by = "id")

location_address <- grep('address', location, value = TRUE)
clean_location <- sub('.*="([^"]*)".*', '\\1', location_address)
df_location <- data.frame(clean_location)
df_location$id <- c(1:100)
final_df <- left_join(final_df, df_location, by = "id")

clean_prices <- sub('.*="([^"]*)".*', '\\1', only_prices)
df_prices <- data.frame(clean_prices)
df_prices$id <- c(1:100)
df_prices$clean_prices <- gsub(",", ".", df_prices$clean_prices)

df_prices$clean_prices <- as.numeric(df_prices$clean_prices)
df_prices$clean_prices <- round(df_prices$clean_prices, 2)
final_df <- left_join(final_df, df_prices, by = "id")


final_df$show <- gsub("-", " ", final_df$show)
final_df$theater_name <- gsub("-", " ", final_df$theater_name)

current_date <- as.Date("2024-03-22", format = "%Y-%m-%d")
current_date <- rep(current_date, times = 100)
df_current_date <- data.frame(current_date)
df_current_date$id <- c(1:100)

final_df <- left_join(final_df, df_current_date, by = "id")
final_df <- final_df %>% 
  select(id, everything())

final_df <- final_df %>% 
  rename(location = clean_location, prices = clean_prices)

final_df <- final_df %>% 
  relocate(dates, .after = prices)

final_df <- final_df %>% 
  mutate(days_elapsed = difftime(current_date, dates))

final_df <- left_join(final_df, df_1, by  = "id")


final_df$days_elapsed <- as.numeric(final_df$days_elapsed, units = "days")



#write.csv(final_df, "final_df.csv", row.names = FALSE) # I have saved df_final as a database in case Tomaticket removes any links that might prevent us from retrieving that observation again

final_df <- read_csv("final_df.csv")
## Rows: 100 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): theater_name, show, location
## dbl  (5): id, prices, days_elapsed, Latitud, Longitud
## date (2): dates, current_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#I remove observations 93 and 98 because the theater does not correspond to the show (in Tomaticket)
final_df <- final_df %>%
  slice(-93, -98)

final_df
## # A tibble: 98 × 10
##       id theater_name show  location prices dates      current_date days_elapsed
##    <dbl> <chr>        <chr> <chr>     <dbl> <date>     <date>              <dbl>
##  1     1 teatros luc… dobl… Madrid     18   2021-05-07 2024-03-22           1050
##  2     2 teatre cond… yo s… Barcelo…   17   2022-06-17 2024-03-22            644
##  3     3 teatro enca… magi… Madrid     15   2022-08-27 2024-03-22            573
##  4     4 pequeno tea… cort… Madrid     20   2022-08-31 2024-03-22            569
##  5     5 off latina   impo… Madrid      6.5 2022-09-03 2024-03-22            566
##  6     6 teatro arle… la m… Madrid     12   2022-09-26 2024-03-22            543
##  7     7 gran teatro… davi… Madrid     20   2022-10-08 2024-03-22            531
##  8     8 teatro arle… la a… Madrid     14   2022-10-12 2024-03-22            527
##  9     9 teatro capi… alex… Madrid     16   2022-10-22 2024-03-22            517
## 10    10 teatro figa… una … Madrid     16   2023-01-01 2024-03-22            446
## # ℹ 88 more rows
## # ℹ 2 more variables: Latitud <dbl>, Longitud <dbl>

Descriptive Analysis

  1. National Level
ggplot(data = final_df, aes(x = reorder(location, location, length))) +
  geom_bar(aes(fill = location)) +
  geom_text(stat='count', aes(label=..count..), vjust=-0.5) +
  labs(title = "Number of shows per city",
       x = "City",
       y = "Number of shows") +
  theme_minimal()

At Tomaticket, there are 98 theater productions available, of which 82 are in theaters in Madrid, 13 in Barcelona, and one in Granada, Málaga, and Valladolid.

ggplot(final_df, aes(x = days_elapsed)) +
  geom_density(fill = "skyblue", color = "navyblue") +
  labs(title = "Distribution of duration (available days)", x = "Days", y = "Density") +
  theme_minimal()

#Mean and mode
media <- mean(final_df$days_elapsed)
moda <- names(sort(-table(final_df$days_elapsed)))[1]
media
## [1] 255.3469
moda
## [1] "182"

We can observe that a large part of the theater productions are available in theaters for less than 250 days. On average, the shows advertised on Tomaticket are available in mean in each theater for about 256 days; however, the usual or typical availability is around 182 days (6 months).

avg_price_per_city <- final_df %>%
  group_by(location) %>%
  summarise(avg_price = mean(prices, na.rm = TRUE))

ggplot(avg_price_per_city, aes(x = reorder(location, avg_price), y = avg_price, fill = avg_price)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = sprintf("€%.2f", avg_price)), vjust = -0.5, color = "black", size = 3) +  # Añadir etiquetas de texto con los precios
  labs(title = "Average Price by Location",
       x = "Location",
       y = "Average price") +
  scale_fill_gradient(low = "lightgreen", high = "orange") +  # Escala de color de menos a más precio
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Valladolid has the highest average price (€22) for the theater productions available on Tomaticket, followed by Barcelona (€15.53) and Madrid (€14.15). On the other hand, the cities with the cheapest theater productions are Granada and Málaga, with prices below €10.50.

However, for Valladolid, Granada, and Málaga, we only have one theater production available. This means that the average price for those cities will be equal to the price of that single observation.

In practical terms, this may result in a biased representation of the average price for those cities, as the price of a single observation may not be representative of the price distribution in that particular city. Therefore, it is important to consider this bias when interpreting the results of the analysis.

Therefore, we will focus our analysis on the theater productions advertised on Tomaticket only for Madrid, and we will compare between theaters.

  1. Analyzing Madrid´s theaters
data_madrid <- final_df %>% 
  filter(location == "Madrid")
precio_medio_por_teatro <- data_madrid %>%
  group_by(theater_name) %>%
  summarize(precio_medio = mean(prices))




ggplot(precio_medio_por_teatro, aes(x = reorder(theater_name, precio_medio), y = precio_medio)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Average Price per Theater",
       x = "Theater",
       y = "Average Price") +
  theme(axis.text.x = element_text(angle = 40, size = 6, hjust = 1))

Let´s see the most expensive and the cheapest theaters in Madrid

data_madrid_tp <- data_madrid %>%
  group_by(theater_name) %>%
  summarise(avg_price = round(mean(prices, na.rm = TRUE), 2)) # 27 theaters

Most expensive theaters:

data_madrid_tp %>% slice_max(avg_price, n = 6)
## # A tibble: 6 × 2
##   theater_name                              avg_price
##   <chr>                                         <dbl>
## 1 gran teatro caixabank principe pio madrid      21  
## 2 teatro marquina                                18.2
## 3 taberna flamenca el cortijo madrid             18  
## 4 teatro alcazar madrid                          17.5
## 5 teatro reina victoria madrid                   17.5
## 6 pequeno teatro gran via madrid                 16.5

Cheapest theaters:

data_madrid_tp %>% slice_min(avg_price, n = 6)
## # A tibble: 6 × 2
##   theater_name               avg_price
##   <chr>                          <dbl>
## 1 iIntruso bar madrid             5   
## 2 wit comedy club madrid          7   
## 3 meltdown madrid                 8   
## 4 teatro bellas artes madrid      8   
## 5 off latina                      8.64
## 6 SOJO Laboratorio Teatral       11.0
obras_por_teatro <- data_madrid %>%
  group_by(theater_name) %>%
  summarize(num_obras = n())


ggplot(obras_por_teatro, aes(x = reorder(theater_name, num_obras), y = num_obras, fill = num_obras)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") + 
  labs(title = "Número de Obras por Teatro",
       x = "Teatro",
       y = "Número de Obras") +
  coord_flip() +
  geom_text(aes(label = num_obras), hjust = -0.2, size = 3)  

“Teatro Luchana” is the theater that hosts the most shows in Madrid (17), followed by “Teatro Lara” (10), “Teatro Arlequín” (9), and “Teatro Off Latina” (7).

largest_shows_madrid <- data_madrid %>%
  top_n(5, days_elapsed)
largest_shows_madrid
## # A tibble: 5 × 10
##      id theater_name  show  location prices dates      current_date days_elapsed
##   <dbl> <chr>         <chr> <chr>     <dbl> <date>     <date>              <dbl>
## 1     1 teatros luch… dobl… Madrid     18   2021-05-07 2024-03-22           1050
## 2     3 teatro encan… magi… Madrid     15   2022-08-27 2024-03-22            573
## 3     4 pequeno teat… cort… Madrid     20   2022-08-31 2024-03-22            569
## 4     5 off latina    impo… Madrid      6.5 2022-09-03 2024-03-22            566
## 5     6 teatro arleq… la m… Madrid     12   2022-09-26 2024-03-22            543
## # ℹ 2 more variables: Latitud <dbl>, Longitud <dbl>
ggplot(data = largest_shows_madrid,
                  aes(x = reorder(show, -days_elapsed),
                      y = days_elapsed)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "Show", y = "Duración (días desde su fecha hasta ahora)", title = "Duración de los 5 shows más largos") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The show ‘Doble o Nada’ stands out significantly above the rest, with a run time of over 1000 days. The other shows closely following it have an approximate duration of 500 days each.

plot_ly(data = data_madrid, x = ~days_elapsed, y = ~prices, text = ~paste("Nombre del show: ", show, "<br>Duración: ", current_date - dates, " días<br>Precio: $", prices), hoverinfo = "text",
        type = 'scatter', mode = 'markers', marker = list(color = ~prices, colorscale = 'RdYlGn',
                                                         colorbar = list(title = "Precio", ticksuffix = "$"))) %>%
  layout(title = "Relación entre Días Activos del Show y Precio Medio",
         xaxis = list(title = "Días Activos del Show"),
         yaxis = list(title = "Precio Medio"))

In shows priced at €10 or higher, there is a certain positive relationship observed between the number of days the show is active and its price. Although it is not a very strong relationship.

teatros_con_mas_shows <- data_madrid %>%
  group_by(theater_name) %>%
  summarize(num_obras = n(),
            prices = mean(prices),
            days_elapsed = mean(days_elapsed)) %>%
  arrange(desc(num_obras))
data_clustering <- teatros_con_mas_shows  %>%
  select(theater_name, prices, num_obras, days_elapsed)

#normalization
data_clustering_norm <- data_clustering %>%
  select(-theater_name) %>%
  scale()

#K.means clustering
set.seed(123)
kmeans_model <- kmeans(data_clustering_norm, centers = 3)
teatros_con_mas_shows$cluster <- as.factor(kmeans_model$cluster)
grafico <- ggplot(teatros_con_mas_shows, aes(x = prices, y = days_elapsed, color = cluster, 
                                             text = paste("Teatro:", theater_name, "<br>Precio Medio:", prices, "<br>Days Elapsed:", days_elapsed))) +
  geom_point() +
  stat_ellipse(type = "norm", level = 0.95, linetype = "dashed") +
  labs(title = "Clustering de Teatros",
       x = "Precio Medio",
       y = "Days elapsed") +
  scale_color_manual(values = c("blue", "green", "purple"))


grafico_interactivo <- ggplotly(grafico, tooltip = "text") %>%
  plotly::layout(hovermode = "closest")
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
grafico_interactivo

We see that we could group the 27 theaters offering shows on Tomaticket into three different groups, taking into account their price and the number of days they are on the billboard.

Group 1, formed by those theaters whose offered shows have a relatively high average price and are also available on the billboard for more than 350 days.

Group 2, formed by theaters with the lowest prices and whose shows, on average, are available on the billboard for less than 300 days.

And finally, Group 3, which consists of theaters where the availability of shows is less than 300 days, but their prices are relatively high.

data_madrid <- data_madrid %>%
  mutate(Longitud = as.numeric(Longitud),
         Latitud = as.numeric(Latitud)) %>%
  na.omit()


map <- leaflet(data_madrid) %>% 
  addProviderTiles(providers$CartoDB.Positron) %>%
  setView(lng = median(data_madrid$Longitud), lat = median(data_madrid$Latitud), zoom = 9)

color_pal <- colorFactor(
  palette = c( "green", "red"), 
  domain = data_madrid$prices  
)

map <- map %>% 
  addCircles(lng = ~Longitud, 
             lat = ~Latitud,
             color = ~color_pal(prices),
              popup = ~paste("Teatro:",theater_name, "<br>Precio:" ,prices))


map <- map %>% 
  addLegend(position = "bottomleft", pal = color_pal, values = ~prices, bins = 4) 
  



map

Regarding the location of theaters in Madrid whose shows are offered on Tomaticket, the most notable aspect is that, at least for those that Tomaticket provides location information for, all the theaters are located in the city center.

If we consider the relationship between location and price, we do not observe any notable pattern.